这里直接读取作者给定的第一个病人的Gene expression analysis: discovery patient PBMC,用的是 10x genomics 3’ Chromium expression assay
Following sequence alignment and filtering, a total of 12,874 cells were analyzed.
最后是 17,712 genes and 12,874 cells
需要自行下载安装一些必要的R包! 而且需要注意版本 Seurat
因为大量学员在中国大陆,通常不建议大家使用下面的R包安装方法,建议是切换镜像后再下载R包。
参考:http://www.bio-info-trainee.com/3727.html
if (!requireNamespace("BiocManager"))
install.packages("BiocManager")
if (!requireNamespace("Seurat"))
BiocManager::install("Seurat")
加载R包
rm(list = ls()) # clear the environment
#load all the necessary libraries
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))
start_time <- Sys.time()
raw_dataPBMC <- read.csv('../Output_2018-03-12/GSE117988_raw.expMatrix_PBMC.csv.gz', header = TRUE, row.names = 1)
end_time <- Sys.time()
end_time - start_time # for 2.7 GHz i5, 8G RAM, total = 4min
## Time difference of 1.482228 mins
dim(raw_dataPBMC)
## [1] 17712 12874
start_time <- Sys.time()
dataPBMC <- log2(1 + sweep(raw_dataPBMC, 2, median(colSums(raw_dataPBMC))/colSums(raw_dataPBMC), '*')) # Normalization
end_time <- Sys.time()
end_time - start_time
## Time difference of 42.03681 secs
# 3.0版本取消了ExtractField
# 版本2 timePoints <- sapply(colnames(dataPBMC), function(x) ExtractField(x, 2, '[.]'))
timePoints <- sapply(colnames(dataPBMC), function(x) unlist(strsplit(x, "\\."))[2])
timePoints <-ifelse(timePoints == '1', 'PBMC_Pre',
ifelse(timePoints == '2', 'PBMC_EarlyD27',
ifelse(timePoints == '3', 'PBMC_RespD376', 'PBMC_ARD614')))
table(timePoints)
## timePoints
## PBMC_ARD614 PBMC_EarlyD27 PBMC_Pre PBMC_RespD376
## 4516 1592 2082 4684
简单看看表达矩阵的性质,主要是基因数量,细胞数量;以及每个细胞表达基因的数量,和每个基因在多少个细胞里面表达。
fivenum(apply(dataPBMC,1,function(x) sum(x>0) ))
## RP4-669L17.10 LAMB3 NAT10 AC093673.5 RPL21
## 1 6 50 207 12102
boxplot(apply(dataPBMC,1,function(x) sum(x>0) ))
fivenum(apply(dataPBMC,2,function(x) sum(x>0) ))
## CAAGAAATCGATCCCT.2 GGCCGATTCCAGAGGA.3 TCAACGAAGAGCTGGT.3
## 10 321 395
## TGCCAAAGTCTGCGGT.4 TCTGGAATCTATCGCC.3
## 481 2865
hist(apply(dataPBMC,2,function(x) sum(x>0) ))
start_time <- Sys.time()
# 3.0版本Create Seurat object稍作改进
PBMC <- CreateSeuratObject(dataPBMC,
min.cells = 1, min.features = 0,
project = '10x_PBMC') # already normalized
PBMC # 17,712 genes and 12,874 cells
## An object of class Seurat
## 17712 features across 12874 samples within 1 assay
## Active assay: RNA (17712 features)
end_time <- Sys.time()
end_time - start_time
## Time difference of 2.670036 secs
# Add meta.data (nUMI and timePoints)
# 3.0版本可以直接使用 object$name <- vector,当然也可以用AddMetaData
PBMC <- AddMetaData(object = PBMC,
metadata = apply(raw_dataPBMC, 2, sum),
col.name = 'nUMI_raw')
PBMC <- AddMetaData(object = PBMC, metadata = timePoints, col.name = 'TimePoints')
这里绘图,可以指定分组,前提是这个分组变量存在于meta信息里面,我们创建对象后使用函数添加了 TimePoints 属性,所以可以用来进行可视化。
这里是:‘TimePoints’
sce=PBMC
features=c("nFeature_RNA", "nUMI_raw")
VlnPlot(object = sce,
features = features,
group.by = 'TimePoints', ncol = 2)
# 3.0版本将GenePlot替换为FeatureScatter
# 版本2 GenePlot(object = sce, gene1 = "nUMI", gene2 = "nGene")
FeatureScatter(sce,feature1 = "nUMI_raw",feature2 = "nFeature_RNA")
可以看看高表达量基因是哪些
# 3.0版本要将sce@raw.data替换成GetAssayData(object = , assay= ,slot = )
tail(sort(Matrix::rowSums(GetAssayData(sce,assay = "RNA"))))
## EEF1A1 RPL13A RPLP1 TMSB4X B2M MALAT1
## 37203.12 38345.11 38977.99 43027.61 46854.95 62363.12
## 散点图可视化任意两个基因的一些属性(通常是细胞的度量)
# 这里选取两个基因。
tmp=names(sort(Matrix::rowSums(GetAssayData(sce,assay = "RNA")),decreasing = T))
# 版本2 GenePlot(object = sce, gene1 = tmp[1], gene2 = tmp[2])
FeatureScatter(object = sce, feature1 = tmp[1], feature2 = tmp[2])
# 散点图可视化任意两个细胞的一些属性(通常是基因的度量)
# 这里选取两个细胞
# 3.0版本将CellPlot替换成CellScatter,sce@cell.names换为colnames
# 版本2 CellPlot(sce,sce@cell.names[3],sce@cell.names[4],do.ident = FALSE)
CellScatter(sce, colnames(sce)[3],colnames(sce)[4])
This process consists of data normalization and variable feature selection, data scaling, a PCA on variable features, construction of a shared-nearest-neighbors graph, and clustering using a modularity optimizer. Finally, we use a t-SNE to visualize our clusters in a two-dimensional space.
start_time <- Sys.time()
# Cluster PBMC
PBMC <- ScaleData(object = PBMC, vars.to.regress = c('nUMI_raw'), model.use = 'linear', use.umi = FALSE)
# 3.0版本将FindVariableGenes换为FindVariableFeatures,另外将原来的cutoff进行整合,x轴统一归到mean.cutoff中,y轴归到dispersion.cutoff中
# 版本2 PBMC <- FindVariableGenes(object = PBMC, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
PBMC <- FindVariableFeatures(object = PBMC, mean.function = ExpMean, dispersion.function = LogVMR, mean.cutoff = c(0.0125,3), dispersion.cutoff = c(0.5,Inf))
PBMC <- RunPCA(object = PBMC, pc.genes = VariableFeatures(PBMC))
## PC_ 1
## Positive: FTL, FTH1, S100A9, GPX1, CST3, LYZ, S100A8, SAT1, AIF1, PPBP
## LST1, FCN1, CTSS, CSTA, PF4, S100A12, GNG11, TYROBP, PSAP, NRGN
## SDPR, VCAN, COTL1, LGALS2, S100A6, RP11-1143G9.4, MAP3K7CL, NEAT1, HIST1H2AC, RGS18
## Negative: NKG7, IL32, GNLY, GZMA, HLA-A, UBB, CST7, MALAT1, GZMH, GZMB
## CTSW, FGFBP2, KLRB1, CD7, BTG1, HOPX, CMC1, CD247, LTB, KLRD1
## PRF1, CD3D, CXCR4, GZMM, TRBC2, TRBC1, TRAC, TRDC, CCL5, HBA1
## PC_ 2
## Positive: S100A9, LYZ, S100A8, AIF1, FCN1, CTSS, LST1, S100A6, TYROBP, FTL
## CSTA, NEAT1, S100A11, S100A12, VCAN, LGALS2, RP11-1143G9.4, CST3, TYMP, PSAP
## LGALS1, SERPINA1, DUSP1, S100A4, FOS, MNDA, VIM, MAFB, CD14, MS4A6A
## Negative: PF4, GNG11, SDPR, HIST1H2AC, TUBB1, PPBP, ACRBP, RGS18, CLU, NRGN
## GP9, SPARC, MAP3K7CL, NGFRAP1, NCOA4, TSC22D1, TREML1, MMD, PTCRA, RUFY1
## PGRMC1, CLEC1B, C6orf25, HIST1H3H, CMTM5, ITGA2B, TMEM40, MYL9, TUBA4A, CCL5
## PC_ 3
## Positive: HBB, HBA1, HBA2, ALAS2, CD79A, AHSP, HBD, SNCA, IGKC, IGHD
## SLC25A37, IGHM, HLA-DRA, TCL1A, CA1, CD79B, HBM, CD74, MS4A1, IGLC2
## HLA-DQA1, SLC25A39, VPREB3, LTB, BLVRB, HLA-DRB1, BANK1, BPGM, LINC00926, FCER2
## Negative: NKG7, B2M, CCL5, CST7, GNLY, GZMB, HLA-A, GZMA, FGFBP2, S100A4
## MALAT1, HCST, GZMH, CTSW, ACTB, IFITM2, PRF1, KLRB1, KLRD1, ACTG1
## CD247, GAPDH, CMC1, CD7, HOPX, KLRF1, SPON2, SRGN, TRDC, FCGR3A
## PC_ 4
## Positive: HBA1, HBA2, HBB, ALAS2, AHSP, HBD, SNCA, SLC25A37, CA1, HBM
## BLVRB, UBB, SLC25A39, BPGM, MPP1, GMPR, HBQ1, S100A8, PDZK1IP1, S100A9
## S100A12, NCOA4, LYZ, VCAN, IFI27, RP11-1143G9.4, TYROBP, MYL4, FOS, CSTA
## Negative: CD74, CD79A, MALAT1, HLA-DRA, HLA-DPB1, CD37, IGHD, HLA-DRB1, BTG1, IGKC
## CD79B, HLA-DPA1, HLA-DQA1, IGHM, TCL1A, CXCR4, B2M, CD52, MS4A1, IGLC2
## LTB, HLA-DQB1, VPREB3, BANK1, PLPP5, FCER2, LINC00926, IGLC3, CD69, HLA-DOB
## PC_ 5
## Positive: TRAC, IL7R, CD3D, IL32, LTB, GZMK, CD52, TRBC2, VIM, CD8A
## DUSP2, TRBC1, JUNB, MALAT1, B2M, S100A4, GAPDH, KLRG1, S100A12, TRGC2
## HLA-A, S100A6, S100A8, VCAN, CXCR4, RP11-1143G9.4, BTG1, FOS, S100A9, MIAT
## Negative: FCGR3A, HLA-DRA, GZMB, HLA-DPA1, HLA-DPB1, CD74, UBB, FGFBP2, HLA-DRB1, SPON2
## TYROBP, FCER1G, PRF1, KLRF1, HLA-DQA1, NKG7, CD79A, CD79B, RHOC, HBA1
## CST7, HBA2, ALAS2, HBB, HBD, IGHD, AHSP, IFITM3, CLIC3, IGFBP7
## 避免太多log日志被打印出来。
# 3.0版本将FindClusters拆分为FindNeighbors和FindClusters
# 版本2是一个函数
# PBMC <- FindClusters(object = PBMC,
# reduction.type = "pca",
# dims.use = 1:10,
# resolution = 1,
# print.output = 0,
# k.param = 35, save.SNN = TRUE) # 13 clusters
PBMC <- FindNeighbors(PBMC, reduction = "pca", dims = 1:10,
k.param = 35)
PBMC <- FindClusters(object = PBMC,
resolution = 1, verbose=F)
PBMC <- RunTSNE(object = PBMC, dims.use = 1:10)
DimPlot(PBMC, colors = c('green4', 'pink', '#FF7F00', 'orchid', '#99c9fb', 'dodgerblue2', 'grey30', 'yellow', 'grey60', 'grey', 'red', '#FB9A99', 'black'))
end_time <- Sys.time()
end_time - start_time # for 2.7 GHz i5, 8G RAM, total = 1.2h
## Time difference of 11.88596 mins
save(PBMC,file = 'patient1.PBMC.output.Rdata')
# 这个步骤输出文件 1.75G, 遂放弃!
最后,这 13 clusters要进行注释,才能发表,如下所示:
作者文章里面是Representative marker genes shown in Supplementary Fig. 7. 如下所示
可以看到作者对PBMC里面的细胞都挑选了一个基因就命名了。
sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Seurat_3.0.2
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-137 tsne_0.1-3 bitops_1.0-6
## [4] RColorBrewer_1.1-2 httr_1.4.0 sctransform_0.2.0
## [7] tools_3.5.3 R6_2.4.0 irlba_2.3.3
## [10] KernSmooth_2.23-15 lazyeval_0.2.2 colorspace_1.4-1
## [13] npsurv_0.4-0 withr_2.1.2 tidyselect_0.2.5
## [16] gridExtra_2.3 compiler_3.5.3 plotly_4.9.0
## [19] labeling_0.3 caTools_1.17.1.2 scales_1.0.0
## [22] lmtest_0.9-37 ggridges_0.5.1 pbapply_1.4-0
## [25] stringr_1.4.0 digest_0.6.19 rmarkdown_1.12
## [28] R.utils_2.8.0 pkgconfig_2.0.2 htmltools_0.3.6
## [31] bibtex_0.4.2 htmlwidgets_1.3 rlang_0.3.4
## [34] zoo_1.8-5 jsonlite_1.6 ica_1.0-2
## [37] gtools_3.8.1 dplyr_0.8.1 R.oo_1.22.0
## [40] magrittr_1.5 Matrix_1.2-15 Rcpp_1.0.1
## [43] munsell_0.5.0 ape_5.3 reticulate_1.12
## [46] R.methodsS3_1.7.1 stringi_1.4.3 yaml_2.2.0
## [49] gbRd_0.4-11 MASS_7.3-51.1 gplots_3.0.1.1
## [52] Rtsne_0.15 plyr_1.8.4 grid_3.5.3
## [55] parallel_3.5.3 gdata_2.18.0 listenv_0.7.0
## [58] ggrepel_0.8.0 crayon_1.3.4 lattice_0.20-38
## [61] cowplot_0.9.4 splines_3.5.3 SDMTools_1.1-221.1
## [64] knitr_1.22 pillar_1.3.1 igraph_1.2.4
## [67] future.apply_1.3.0 reshape2_1.4.3 codetools_0.2-16
## [70] glue_1.3.1 evaluate_0.13 lsei_1.2-0
## [73] metap_1.1 data.table_1.12.0 BiocManager_1.30.4
## [76] png_0.1-7 Rdpack_0.11-0 gtable_0.3.0
## [79] RANN_2.6.1 purrr_0.3.2 tidyr_0.8.3
## [82] future_1.14.0 assertthat_0.2.1 ggplot2_3.1.0
## [85] xfun_0.8 rsvd_1.0.1 survival_2.43-3
## [88] viridisLite_0.3.0 tibble_2.1.1 cluster_2.0.7-1
## [91] globals_0.12.4 fitdistrplus_1.0-14 ROCR_1.0-7